In [1]:
import pandas as pd
import plotly.express as px
import datetime
import requests
import json
EDA¶
Load and infomation¶
In [2]:
gasoline = pd.read_csv("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML0232EN-SkillsNetwork/asset/18100001.csv")
gasoline.head()
Out[2]:
| REF_DATE | GEO | DGUID | Type of fuel | UOM | UOM_ID | SCALAR_FACTOR | SCALAR_ID | VECTOR | COORDINATE | VALUE | STATUS | SYMBOL | TERMINATED | DECIMALS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Jan-79 | St. John's, Newfoundland and Labrador | 2011S0503001 | Regular unleaded gasoline at full service fill... | Cents per litre | 57 | units | 0 | v735046 | 2.1 | 26.0 | NaN | NaN | t | 1 |
| 1 | Jan-79 | Charlottetown and Summerside, Prince Edward Is... | NaN | Regular unleaded gasoline at full service fill... | Cents per litre | 57 | units | 0 | v735056 | 3.1 | 24.6 | NaN | NaN | t | 1 |
| 2 | Jan-79 | Halifax, Nova Scotia | 2011S0503205 | Regular unleaded gasoline at full service fill... | Cents per litre | 57 | units | 0 | v735057 | 4.1 | 23.4 | NaN | NaN | t | 1 |
| 3 | Jan-79 | Saint John, New Brunswick | 2011S0503310 | Regular unleaded gasoline at full service fill... | Cents per litre | 57 | units | 0 | v735058 | 5.1 | 23.2 | NaN | NaN | t | 1 |
| 4 | Jan-79 | Québec, Quebec | 2011S0503421 | Regular unleaded gasoline at full service fill... | Cents per litre | 57 | units | 0 | v735059 | 6.1 | 22.6 | NaN | NaN | t | 1 |
In [3]:
gasoline.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 41942 entries, 0 to 41941 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 REF_DATE 41942 non-null object 1 GEO 41942 non-null object 2 DGUID 39451 non-null object 3 Type of fuel 41942 non-null object 4 UOM 41942 non-null object 5 UOM_ID 41942 non-null int64 6 SCALAR_FACTOR 41942 non-null object 7 SCALAR_ID 41942 non-null int64 8 VECTOR 41942 non-null object 9 COORDINATE 41942 non-null float64 10 VALUE 41942 non-null float64 11 STATUS 0 non-null float64 12 SYMBOL 0 non-null float64 13 TERMINATED 16564 non-null object 14 DECIMALS 41942 non-null int64 dtypes: float64(4), int64(3), object(8) memory usage: 4.8+ MB
In [4]:
gasoline.columns
Out[4]:
Index(['REF_DATE', 'GEO', 'DGUID', 'Type of fuel', 'UOM', 'UOM_ID',
'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS',
'SYMBOL', 'TERMINATED', 'DECIMALS'],
dtype='object')
In [5]:
gasoline.isnull().sum()
Out[5]:
REF_DATE 0 GEO 0 DGUID 2491 Type of fuel 0 UOM 0 UOM_ID 0 SCALAR_FACTOR 0 SCALAR_ID 0 VECTOR 0 COORDINATE 0 VALUE 0 STATUS 41942 SYMBOL 41942 TERMINATED 25378 DECIMALS 0 dtype: int64
Adjust Data¶
In [6]:
data = (gasoline[['REF_DATE','GEO','Type of fuel','VALUE']]).rename(columns={"REF_DATE" : "DATE", "Type of fuel" : "TYPE"})
data.head()
Out[6]:
| DATE | GEO | TYPE | VALUE | |
|---|---|---|---|---|
| 0 | Jan-79 | St. John's, Newfoundland and Labrador | Regular unleaded gasoline at full service fill... | 26.0 |
| 1 | Jan-79 | Charlottetown and Summerside, Prince Edward Is... | Regular unleaded gasoline at full service fill... | 24.6 |
| 2 | Jan-79 | Halifax, Nova Scotia | Regular unleaded gasoline at full service fill... | 23.4 |
| 3 | Jan-79 | Saint John, New Brunswick | Regular unleaded gasoline at full service fill... | 23.2 |
| 4 | Jan-79 | Québec, Quebec | Regular unleaded gasoline at full service fill... | 22.6 |
In [7]:
data[['City', 'Province']] = data['GEO'].str.split(',', n=1, expand=True)
data.head()
Out[7]:
| DATE | GEO | TYPE | VALUE | City | Province | |
|---|---|---|---|---|---|---|
| 0 | Jan-79 | St. John's, Newfoundland and Labrador | Regular unleaded gasoline at full service fill... | 26.0 | St. John's | Newfoundland and Labrador |
| 1 | Jan-79 | Charlottetown and Summerside, Prince Edward Is... | Regular unleaded gasoline at full service fill... | 24.6 | Charlottetown and Summerside | Prince Edward Island |
| 2 | Jan-79 | Halifax, Nova Scotia | Regular unleaded gasoline at full service fill... | 23.4 | Halifax | Nova Scotia |
| 3 | Jan-79 | Saint John, New Brunswick | Regular unleaded gasoline at full service fill... | 23.2 | Saint John | New Brunswick |
| 4 | Jan-79 | Québec, Quebec | Regular unleaded gasoline at full service fill... | 22.6 | Québec | Quebec |
In [8]:
data['DATE'] = pd.to_datetime(data['DATE'], format='%b-%y')
data['Month'] = data['DATE'].dt.month_name().str.slice(stop=3)
data['Year'] = data['DATE'].dt.year
data.head()
Out[8]:
| DATE | GEO | TYPE | VALUE | City | Province | Month | Year | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1979-01-01 | St. John's, Newfoundland and Labrador | Regular unleaded gasoline at full service fill... | 26.0 | St. John's | Newfoundland and Labrador | Jan | 1979 |
| 1 | 1979-01-01 | Charlottetown and Summerside, Prince Edward Is... | Regular unleaded gasoline at full service fill... | 24.6 | Charlottetown and Summerside | Prince Edward Island | Jan | 1979 |
| 2 | 1979-01-01 | Halifax, Nova Scotia | Regular unleaded gasoline at full service fill... | 23.4 | Halifax | Nova Scotia | Jan | 1979 |
| 3 | 1979-01-01 | Saint John, New Brunswick | Regular unleaded gasoline at full service fill... | 23.2 | Saint John | New Brunswick | Jan | 1979 |
| 4 | 1979-01-01 | Québec, Quebec | Regular unleaded gasoline at full service fill... | 22.6 | Québec | Quebec | Jan | 1979 |
Checking Type of Feature¶
In [9]:
data['GEO'].unique().tolist()
Out[9]:
["St. John's, Newfoundland and Labrador", 'Charlottetown and Summerside, Prince Edward Island', 'Halifax, Nova Scotia', 'Saint John, New Brunswick', 'Québec, Quebec', 'Montréal, Quebec', 'Ottawa-Gatineau, Ontario part, Ontario/Quebec', 'Toronto, Ontario', 'Thunder Bay, Ontario', 'Winnipeg, Manitoba', 'Regina, Saskatchewan', 'Saskatoon, Saskatchewan', 'Edmonton, Alberta', 'Calgary, Alberta', 'Vancouver, British Columbia', 'Victoria, British Columbia', 'Whitehorse, Yukon', 'Yellowknife, Northwest Territories']
In [10]:
data['TYPE'].unique().tolist()
Out[10]:
['Regular unleaded gasoline at full service filling stations', 'Regular unleaded gasoline at self service filling stations', 'Premium unleaded gasoline at full service filling stations', 'Premium unleaded gasoline at self service filling stations', 'Diesel fuel at full service filling stations', 'Household heating fuel', 'Diesel fuel at self service filling stations']
Descriptive Statistic¶
In [11]:
data['VALUE'].describe()
Out[11]:
count 41942.000000 mean 84.784858 std 31.492697 min 18.300000 25% 58.200000 50% 79.200000 75% 110.900000 max 191.600000 Name: VALUE, dtype: float64
In [12]:
group_year = data.groupby(['Year'])['VALUE'].mean()
group_year
Out[12]:
Year 1979 23.604444 1980 28.068750 1981 38.002604 1982 44.701563 1983 47.904688 1984 50.442708 1985 53.899479 1986 48.405208 1987 49.758333 1988 49.217188 1989 51.700000 1990 55.048735 1991 56.527041 1992 54.633832 1993 54.334734 1994 54.247899 1995 56.177451 1996 58.134110 1997 59.182062 1998 56.247246 1999 58.743362 2000 72.207839 2001 72.403107 2002 70.312147 2003 75.541667 2004 82.960452 2005 96.328743 2006 101.209393 2007 105.258263 2008 123.340678 2009 96.969068 2010 106.369845 2011 126.790607 2012 130.380085 2013 129.677273 2014 133.169203 2015 110.366908 2016 101.790821 2017 112.852657 2018 129.408575 2019 125.776329 2020 107.617150 2021 133.990580 Name: VALUE, dtype: float64
Visualization¶
Load file Canada Provinces to know the ID provinces¶
In [13]:
geo = requests.get("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML0232EN-SkillsNetwork/asset/canada_provinces.geojson")
In [14]:
geo_data = geo.json()
for feature in geo_data['features']:
cartodb_id = feature['properties']['cartodb_id']
name = feature['properties']['name']
print(cartodb_id, name)
6 British Columbia 5 Newfoundland and Labrador 13 Northwest Territories 12 Nunavut 3 Saskatchewan 1 Quebec 4 Alberta 10 Manitoba 2 Nova Scotia 11 Ontario 7 New Brunswick 8 Prince Edward Island 9 Yukon
Choosing specific year¶
In [15]:
one_year = data[data['Year'] == 2021]
one_year.head()
Out[15]:
| DATE | GEO | TYPE | VALUE | City | Province | Month | Year | |
|---|---|---|---|---|---|---|---|---|
| 41252 | 2021-01-01 | St. John's, Newfoundland and Labrador | Regular unleaded gasoline at self service fill... | 124.8 | St. John's | Newfoundland and Labrador | Jan | 2021 |
| 41253 | 2021-01-01 | St. John's, Newfoundland and Labrador | Premium unleaded gasoline at self service fill... | 130.6 | St. John's | Newfoundland and Labrador | Jan | 2021 |
| 41254 | 2021-01-01 | St. John's, Newfoundland and Labrador | Diesel fuel at self service filling stations | 126.7 | St. John's | Newfoundland and Labrador | Jan | 2021 |
| 41255 | 2021-01-01 | St. John's, Newfoundland and Labrador | Household heating fuel | 89.8 | St. John's | Newfoundland and Labrador | Jan | 2021 |
| 41256 | 2021-01-01 | Charlottetown and Summerside, Prince Edward Is... | Regular unleaded gasoline at self service fill... | 109.1 | Charlottetown and Summerside | Prince Edward Island | Jan | 2021 |
Create the dataframe including province and id like dataset above¶
In [16]:
geodata = one_year.groupby('Province')['VALUE'].mean().reset_index(name ='Average Gasoline Price').round(2)
provinces={' Newfoundland and Labrador':5,
' Prince Edward Island':8,
' Nova Scotia':2,
' New Brunswick':7,
' Quebec':1,
' Ontario':11,
' Ontario part, Ontario/Quebec':12,
' Manitoba':10,
' Saskatchewan':3,
' Alberta':4,
' British Columbia':6,
' Yukon':9,
' Northwest Territories':13
}
geodata['ProvinceID']=geodata['Province'].map(provinces)
geodata
Out[16]:
| Province | Average Gasoline Price | ProvinceID | |
|---|---|---|---|
| 0 | Alberta | 130.48 | 4 |
| 1 | British Columbia | 151.17 | 6 |
| 2 | Manitoba | 127.48 | 10 |
| 3 | New Brunswick | 128.35 | 7 |
| 4 | Newfoundland and Labrador | 135.54 | 5 |
| 5 | Northwest Territories | 136.13 | 13 |
| 6 | Nova Scotia | 123.54 | 2 |
| 7 | Ontario | 140.85 | 11 |
| 8 | Ontario part, Ontario/Quebec | 135.79 | 12 |
| 9 | Prince Edward Island | 123.80 | 8 |
| 10 | Quebec | 131.44 | 1 |
| 11 | Saskatchewan | 125.89 | 3 |
| 12 | Yukon | 141.50 | 9 |
In [17]:
mp = json.loads(geo.text)
fig = px.choropleth(geodata,
locations="ProvinceID",
geojson=mp,
featureidkey="properties.cartodb_id",
color="Average Gasoline Price",
color_continuous_scale=px.colors.sequential.YlOrRd, # px.colors.diverging.Tropic
scope='north america',
title='<b>Average Gasoline Price </b>',
hover_name='Province',
hover_data={
'Average Gasoline Price': True,
'ProvinceID': False
},
locationmode='geojson-id',
)
fig.update_layout(
showlegend=True,
legend_title_text='<b>Average Gasoline Price</b>',
font={"size": 16, "color": "#808080", "family" : "calibri"},
margin={"r":0,"t":40,"l":0,"b":0},
legend=dict(orientation='v'),
geo=dict(bgcolor='rgba(0,0,0,0)', lakecolor='#e0fffe')
)
fig.update_geos(showcountries=False, showcoastlines=False,
showland= False, fitbounds="locations",
subunitcolor='white')
fig.show()